{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# An empty CUDA_VISIBLE_DEVICES leaves no GPU visible to CUDA, so everything below runs on CPU.\n",
    "# NOTE(review): this only takes effect if it is set before CUDA is initialised - keep this cell first.\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
    "\n",
    "# NOTE(review): presumably the tokenizer of the base model that the local\n",
    "# 'finetune-t5-small-standard-bahasa-cased-v2' checkpoints were fine-tuned from - confirm.\n",
    "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['finetune-t5-small-standard-bahasa-cased-v2/checkpoint-460000',\n",
       " 'finetune-t5-small-standard-bahasa-cased-v2/checkpoint-470000']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "from glob import glob\n",
    "\n",
    "def _checkpoint_step(path):\n",
    "    \"\"\"Return the integer step of a '.../checkpoint-N' path, or -1 if the suffix is not numeric.\"\"\"\n",
    "    match = re.search(r'checkpoint-(\\d+)$', path)\n",
    "    return int(match.group(1)) if match else -1\n",
    "\n",
    "# Order by numeric step so that checkpoints[-1] is the most recent one. A plain string\n",
    "# sort would place 'checkpoint-1000000' before 'checkpoint-460000'.\n",
    "checkpoints = sorted(\n",
    "    glob('finetune-t5-small-standard-bahasa-cased-v2/checkpoint-*'),\n",
    "    key=_checkpoint_step,\n",
    ")\n",
    "checkpoints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# checkpoints[-1] is the last entry of the sorted list built above;\n",
    "# this raises IndexError if the glob matched nothing.\n",
    "model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "596"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "# minimum cleaning, just simply to remove newlines.\n",
    "def cleaning(string):\n",
    "    \"\"\"Flatten `string` onto one line.\n",
    "\n",
    "    Newlines become spaces, runs of spaces collapse to a single space and\n",
    "    leading/trailing whitespace is stripped.\n",
    "    \"\"\"\n",
    "    flattened = ' '.join(string.split('\\n'))\n",
    "    return re.sub(r'[ ]+', ' ', flattened).strip()\n",
    "\n",
    "s = \"\"\"\n",
    "BERA: The eldest son of former prime minister Datuk Seri Najib Razak, Datuk Nizar Mohd Najib, has hinted that he might be fielded in one of the four state seats under the Pekan parliamentary constituency.\n",
    "\n",
    "The 44-year-old Pekan Umno Youth division chief said he has a feeling that he will be making his general election debut at one of the state seats.\n",
    "\n",
    "He will wait for the final announcement from the Barisan Nasional (BN) leadership.\n",
    "\n",
    "\"My name has been nominated for both the Pekan parliamentary and a state seat, but I do not know (where I will contest). I will leave it to the party's leadership to decide.\n",
    "\n",
    "On possible talks where he might be fielded in GE15, Nizar said he was not sure, but guessed that he could be fielded in a state seat.\n",
    "\n",
    "\"My guess is that it maybe be a DUN (Dewan Undangan Negeri or state constituency), but I don't know yet.\n",
    "\n",
    "\"Let's wait for the final announcement as these things can change... they change every time. We have to wait for the final outcome. The decision making is not in my hands, so we can only work hard and leave the rest to God.\"\n",
    "\n",
    "Nizar said BN's fighting spirit in Pekan was strong because it has been a stronghold for Umno and BN.\n",
    "\n",
    "He said BN wants to defend the seat and win big in Pekan and the four state seats of Peramu Jaya, Chini, Pulau Manis and Bebar.\n",
    "\n",
    "\"In Pekan, we have four state seats and if we make a clean sweep, then we have secured 14 per cent from the overall state seats in Pahang (total 42 state seats) to achieve a two-third majority. We need to get the best from our winnable state seats to ensure a win.\n",
    "\n",
    "\"If we are able to secure a huge victory in Pekan, then that will be the best and meaningful gift to Datuk Seri Najib Razak to help uplift his spirits to continue fighting as it is not easy being there (in prison) alone.\n",
    "\n",
    "\"A big victory in Pekan will certainly be a huge morale booster for him.\"\n",
    "\n",
    "Najib, who is the former Pekan member of parliament, had won the seat for the 10th term in the 14th General Election in 2018.\n",
    "\n",
    "Najib was sentenced to 12 years' imprisonment and fined RM210 million for misappropriating RM42 million SRC International funds, a former subsidiary of 1Malaysia Development Bhd.\n",
    "\n",
    "Pekan Umno division deputy chief Datuk Seri Zamri Ramly was recently quoted as saying that there was no reason to give Nizar both the state assembly and parliamentary seats as he lacks experience in politics.\n",
    "\n",
    "\"Regardless whether he is fielded at a parliamentary or state seat, Nizar will be well-accepted by voters in Pekan. If he is chosen as a candidate, Nizar will be among the new faces in Umno. However, let's wait for the party leadership to make an official announcement,\" said Zamri.\n",
    "\n",
    "Meanwhile, Pahang Umno Youth chief Mohd Farid Mohd Nor said Nizar's presence as a GE15 candidate would certainly bring victory to Umno in Pekan as he will be able to continue Najib's legacy in the constituency.\n",
    "\n",
    "It is learnt that besides Nizar, Zamri, who was appointed as the constituency's \"caretaker\" after Najib was sent to jail, might be also considered to contest in GE15.\n",
    "\n",
    "Pekan has been held by the country's second prime minister Tun Abdul Razak Hussein's family since 1959. Najib took over his father's seat in a by-election held in 1976 after the latter's death.\n",
    "\n",
    "Najib served as the Pahang menteri besar for one term between 1982 and 1986.\n",
    "\"\"\"\n",
    "\n",
    "# Pass the sample through unidecode, then flatten it with cleaning();\n",
    "# the cell displays the number of whitespace-separated words.\n",
    "s = cleaning(unidecode(s))\n",
    "len(s.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "170"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "PERDANA Menteri Datuk Seri Anwar Ibrahim sekali lagi mencuri tumpuan netizen selepas gambar-gambarnya menghadiri mesyuarat bersama Ketua Setiausaha Negara, Tan Sri Mohd Zuki Ali viral di media sosial.\n",
    "Menerusi satu ciapan di Twitter, beliau memuat naik beberapa keping foto yang memperlihatkan dirinya menggayakan persalinan baju Melayu berona ungu, lengkap dengan samping dan songkok.\n",
    "Selain itu, beliau turut menyarungkan sepasang capal ketika menghadiri mesyuarat berkenaan.\n",
    "\n",
    "Bagaimanapun, menerusi salah satu gambar, Anwar dilihat membuka capalnya ketika sedang bermesyuarat dan hal itu mencetus perbincangan dalam kalangan netizen.\n",
    "“Dah la pakai selipar, masa meeting boleh tanggal camtu aje,” ujar seorang netizen.\n",
    "Namun, ciapan tersebut dibalas oleh seorang lagi pengguna Twitter.\n",
    "“Untuk makluman bersama, capal dibenarkan untuk dipakai bersama baju Melayu bersamping. Aku pun baru tahu,” katanya.\n",
    "Selain itu, ada juga antara pengguna media sosial yang memberitahu yang martabat capal lebih tinggi berbanding selipar dan kasut-kasut lain.\n",
    "“Capal actually salah satu pakaian Melayu tradisional. Dipakai bangsawan dan para pahlawan pada zaman dulu kala. Jadi, martabat capal tu lebih tinggi dari selipar dan kasut-kasut lain,” katanya.\n",
    "\"\"\"\n",
    "\n",
    "# Pass the sample through unidecode, then flatten it with cleaning();\n",
    "# the cell displays the number of whitespace-separated words.\n",
    "s = cleaning(unidecode(s))\n",
    "len(s.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "247"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "Dato' Sri Haji Mohammad Najib bin Tun Haji Abdul Razak (Jawi: محمد نجيب بن عبد الرزاق, Malay pronunciation: [muhammad nadʒɪb]; born 23 July 1953) is a Malaysian politician who served as the 6th prime minister of Malaysia from April 2009 to May 2018. In 2020, he was convicted of corruption in the 1Malaysia Development Berhad scandal,[2] one of the largest money-laundering scandals in history.[3][4] He is the son of former prime minister Abdul Razak Hussein. Najib Razak was the chairman of the Barisan Nasional (BN) coalition from April 2009 to May 2018 and the president of the United Malays National Organisation (UMNO) from November 2008 to May 2018,[5][6][7] which had maintained control of Malaysia's government with a parliamentary majority for more than sixty years until the coalition's defeat in the 2018 general election.\n",
    "\n",
    "Najib was elected to the Parliament of Malaysia in 1976, at the age of 23, replacing his deceased father in the Pahang-based seat of Pekan. In the same year, he was appointed the head of UMNO Youth's Pekan branch and became a member of the youth wing's Executive Council. In the early years of his political career, Najib took on a deputy minister role in 1976, and between 1982 and 1986, he was the Menteri Besar of Pahang. Thereafter, until 2009, he was rotated throughout the Cabinet of Malaysia, taking on various ministerial portfolios in defence, education, culture, youth and sports, and finally finance. Between 1993 and 2009, Najib was a vice-president of UMNO.[8]\n",
    "\"\"\"\n",
    "# Pass the sample through unidecode, then flatten it with cleaning();\n",
    "# the cell displays the number of whitespace-separated words.\n",
    "s = cleaning(unidecode(s))\n",
    "len(s.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['English News',\n",
       " 'Zamri Ramly',\n",
       " 'Pekan',\n",
       " 'UMNO',\n",
       " 'Nizar Mohd Najib',\n",
       " 'GE15',\n",
       " 'state seats',\n",
       " 'Najib Razak',\n",
       " 'Mohd Farid Mohd Nor']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Greedy decoding: ask the model for 10 keywords of the current sample `s`.\n",
    "input_ids = tokenizer.encode(f'10 kata kunci: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length=256,)\n",
    "t = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
    "# dict.fromkeys drops duplicates while keeping the generation order. list(set(...))\n",
    "# ordered the strings by their randomized hashes, so the displayed list changed between kernels.\n",
    "t = list(dict.fromkeys(t.split(',')))\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Tun Abdul Razak Hussein',\n",
       " 'Melayu',\n",
       " 'Pekan',\n",
       " 'Datuk Khalid',\n",
       " 'Malaysia',\n",
       " 'Councils',\n",
       " 'Najib Razak',\n",
       " 'Zamri',\n",
       " 'Somalia',\n",
       " 'Zamri Ramly',\n",
       " 'Ahmad Zahid Hamidi',\n",
       " 'Pulau Manis',\n",
       " 'UMNO',\n",
       " 'Ketua Menteri',\n",
       " 'Muhyiddin Yassin',\n",
       " 'RM27 juta',\n",
       " 'Samantha Khan',\n",
       " 'Abdul Razak Hussein',\n",
       " 'Keret',\n",
       " 'Switzerland',\n",
       " 'Peramu Jaya',\n",
       " 'Istana Negara',\n",
       " 'Sultan Abdullah',\n",
       " 'Barisan Nasional',\n",
       " 'SRC',\n",
       " 'Morality',\n",
       " 'Kempen',\n",
       " 'GE15',\n",
       " 'Australia',\n",
       " 'Kang',\n",
       " 'K-Pop',\n",
       " 'Angkatan',\n",
       " 'Mohd Farid Mohd Nor',\n",
       " 'Damascus',\n",
       " 'Stadium',\n",
       " 'Bebar',\n",
       " 'Parlimen',\n",
       " 'Developing',\n",
       " 'SRC International',\n",
       " 'Pahang',\n",
       " 'Chini',\n",
       " 'Hong Kong',\n",
       " 'Tun Abdul Razak']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import set_seed\n",
    "\n",
    "# Sampling is stochastic: fix the seed so a fresh Restart & Run All reproduces this output.\n",
    "set_seed(42)\n",
    "input_ids = tokenizer.encode(f'50 kata kunci: {s}', return_tensors = 'pt')\n",
    "# top_k=0 turns top-k filtering off, i.e. plain temperature sampling over the whole vocabulary.\n",
    "outputs = model.generate(input_ids, do_sample=True,\n",
    "    max_length=256,\n",
    "    top_k=0,\n",
    "    temperature=0.7)\n",
    "t = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
    "# dict.fromkeys drops duplicates while keeping the generation order. list(set(...))\n",
    "# ordered the strings by their randomized hashes, so the displayed list changed between kernels.\n",
    "t = list(dict.fromkeys(t.split(',')))\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Tun Abdul Razak Hussein',\n",
       " 'Pekan',\n",
       " 'UMNO-Barisan Nasional',\n",
       " 'Zamri Ramly',\n",
       " 'UMNO',\n",
       " 'Nizar Mohd Najib',\n",
       " 'Abdul Razak Hussein',\n",
       " 'English News',\n",
       " 'Barisan Nasional',\n",
       " 'Malaysia Memilih',\n",
       " 'GE15',\n",
       " 'malaysiamemilih',\n",
       " 'Mohd Farid Mohd Nor',\n",
       " 'penamaan',\n",
       " 'pilihan raya',\n",
       " 'pilihan raya umum',\n",
       " 'Pahang',\n",
       " 'PRU15',\n",
       " 'calon']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decoding with penalty_alpha + top_k: ask the model for 20 keywords of the current sample `s`.\n",
    "input_ids = tokenizer.encode(f'20 kata kunci: {s}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 128, penalty_alpha=0.6, top_k=4)\n",
    "t = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
    "# dict.fromkeys drops duplicates while keeping the generation order. list(set(...))\n",
    "# ordered the strings by their randomized hashes, so the displayed list changed between kernels.\n",
    "t = list(dict.fromkeys(t.split(',')))\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/transformers/utils/hub.py:651: UserWarning: The `organization` argument is deprecated and will be removed in v5 of Transformers. Set your organization directly in the `repo_id` passed instead (`repo_id={organization}/{model_id}`).\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-keyword-t5-small-standard-bahasa-cased/commit/b872fa0745e33b92fe555608f641e91c496fd99e', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='b872fa0745e33b92fe555608f641e91c496fd99e', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The `organization` keyword is deprecated (removal announced for Transformers v5);\n",
    "# the organization is given as the namespace of the repo id instead.\n",
    "model.push_to_hub('mesolitica/finetune-keyword-t5-small-standard-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-keyword-t5-small-standard-bahasa-cased/commit/ef819b1adca5df17116a3ca40e784907e5aa3d82', commit_message='Upload tokenizer', commit_description='', oid='ef819b1adca5df17116a3ca40e784907e5aa3d82', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The `organization` keyword is deprecated (removal announced for Transformers v5);\n",
    "# the organization is given as the namespace of the repo id instead.\n",
    "tokenizer.push_to_hub('mesolitica/finetune-keyword-t5-small-standard-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import collections\n",
    "import string\n",
    "\n",
    "def normalize_answer(s):\n",
    "    \"\"\"Normalize text before answers are compared.\n",
    "\n",
    "    Tabs and newlines are deleted outright (not replaced by spaces). The text is\n",
    "    then lower-cased, stripped of ASCII punctuation, stripped of the English\n",
    "    articles a/an/the, and whitespace runs are collapsed to single spaces.\n",
    "    \"\"\"\n",
    "    s = s.replace('\\t', '').replace('\\n', '')\n",
    "\n",
    "    def remove_articles(text):\n",
    "        regex = re.compile(r'\\b(a|an|the)\\b', re.UNICODE)\n",
    "        return re.sub(regex, ' ', text)\n",
    "\n",
    "    def white_space_fix(text):\n",
    "        return ' '.join(text.split())\n",
    "\n",
    "    def remove_punc(text):\n",
    "        exclude = set(string.punctuation)\n",
    "        return ''.join(ch for ch in text if ch not in exclude)\n",
    "\n",
    "    def lower(text):\n",
    "        return text.lower()\n",
    "\n",
    "    # Order matters: articles are matched in lower case, after punctuation is gone.\n",
    "    return white_space_fix(remove_articles(remove_punc(lower(s))))\n",
    "\n",
    "\n",
    "def get_tokens(s):\n",
    "    \"\"\"Return the whitespace-separated tokens of `normalize_answer(s)`; a falsy `s` gives [].\"\"\"\n",
    "    return normalize_answer(s).split() if s else []\n",
    "\n",
    "\n",
    "def compute_exact(a_gold, a_pred):\n",
    "    \"\"\"Return 1 when both answers are equal after `normalize_answer`, otherwise 0.\"\"\"\n",
    "    gold_norm = normalize_answer(a_gold)\n",
    "    pred_norm = normalize_answer(a_pred)\n",
    "    return 1 if gold_norm == pred_norm else 0\n",
    "\n",
    "\n",
    "def compute_f1(a_gold, a_pred):\n",
    "    \"\"\"Token-level F1 between a gold and a predicted answer.\n",
    "\n",
    "    Both strings are tokenized with `get_tokens`; the overlap is the multiset\n",
    "    intersection of the two token lists. Returns an int (0 or 1) when either side\n",
    "    has no tokens or nothing overlaps, otherwise a float.\n",
    "    \"\"\"\n",
    "    reference = get_tokens(a_gold)\n",
    "    candidate = get_tokens(a_pred)\n",
    "    if not reference or not candidate:\n",
    "        # With an empty side the score is 1 only when both sides are empty.\n",
    "        return int(reference == candidate)\n",
    "    shared = collections.Counter(reference) & collections.Counter(candidate)\n",
    "    overlap = sum(shared.values())\n",
    "    if not overlap:\n",
    "        return 0\n",
    "    precision = 1.0 * overlap / len(candidate)\n",
    "    recall = 1.0 * overlap / len(reference)\n",
    "    return (2 * precision * recall) / (precision + recall)\n",
    "\n",
    "def calculate_cer(actual, hyp):\n",
    "    \"\"\"\n",
    "    Character error rate computed with `python-Levenshtein`.\n",
    "\n",
    "    Spaces are removed from both strings; the edit distance is divided by the\n",
    "    number of remaining characters in `actual`. Raises ZeroDivisionError when\n",
    "    `actual` has no non-space characters.\n",
    "    \"\"\"\n",
    "    import Levenshtein as Lev\n",
    "\n",
    "    reference_chars = ''.join(actual.split(' '))\n",
    "    hypothesis_chars = ''.join(hyp.split(' '))\n",
    "    edits = Lev.distance(reference_chars, hypothesis_chars)\n",
    "    return edits / len(reference_chars)\n",
    "\n",
    "def calculate_wer(actual, hyp):\n",
    "    \"\"\"\n",
    "    Word error rate computed with `python-Levenshtein`.\n",
    "\n",
    "    Every distinct word is mapped to one character so that the character-level\n",
    "    edit distance between the encoded strings counts word edits; the result is\n",
    "    divided by the number of words in `actual`. Raises ZeroDivisionError when\n",
    "    `actual` contains no words.\n",
    "    \"\"\"\n",
    "    import Levenshtein as Lev\n",
    "\n",
    "    reference_words = actual.split()\n",
    "    hypothesis_words = hyp.split()\n",
    "    vocabulary = set(reference_words + hypothesis_words)\n",
    "    # one code point per distinct word\n",
    "    symbol = {word: chr(index) for index, word in enumerate(vocabulary)}\n",
    "\n",
    "    encoded_reference = ''.join(symbol[word] for word in reference_words)\n",
    "    encoded_hypothesis = ''.join(symbol[word] for word in hypothesis_words)\n",
    "\n",
    "    return Lev.distance(encoded_reference, encoded_hypothesis) / len(reference_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "948 test.json\r\n"
     ]
    }
   ],
   "source": [
    "# Line count of test.json; the evaluation loop reads this file one JSON record per line.\n",
    "!wc -l test.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "948it [08:49,  1.79it/s]\n"
     ]
    }
   ],
   "source": [
    "f1 = []\n",
    "wer, cer = [], []\n",
    "\n",
    "# Every line of test.json is a JSON object whose 'translation' entry holds 'src', 'prefix' and 'tgt'.\n",
    "# Loop-local names are used on purpose: the previous version reassigned the notebook-level\n",
    "# `s` and `t`, so re-running a generation cell afterwards silently used the last test sample.\n",
    "with open('test.json') as fopen:\n",
    "    for line in tqdm(fopen):\n",
    "        if not line.strip():\n",
    "            # tolerate blank/trailing lines instead of aborting the run inside json.loads\n",
    "            continue\n",
    "        record = json.loads(line)['translation']\n",
    "        source = record['src']\n",
    "        prefix = record['prefix']\n",
    "        # gold keywords are comma-separated: drop tabs and compare them as space-joined text\n",
    "        gold = ' '.join(record['tgt'].replace('\\t', '').split(','))\n",
    "        prompt_ids = tokenizer.encode(f'{prefix}{source}', return_tensors = 'pt')\n",
    "        generated = model.generate(prompt_ids, max_length=256,)\n",
    "        predicted = tokenizer.decode(generated[0], skip_special_tokens=True)\n",
    "        predicted = ' '.join(predicted.split(','))\n",
    "        f1.append(compute_f1(gold, predicted))\n",
    "        wer.append(calculate_wer(gold, predicted))\n",
    "        cer.append(calculate_cer(gold, predicted))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.3291554473802324, 0.9415273555452611, 0.7545079579724842)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Displayed tuple is (mean F1, mean WER, mean CER) over the evaluated records.\n",
    "tuple(np.mean(scores) for scores in (f1, wer, cer))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
